In [1]:
#Importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
import os
import matplotlib.pyplot as plt#visualization
from PIL import  Image
%matplotlib inline
import pandas as pd
import seaborn as sns#visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization
import plotly.express as px

start_time = pd.datetime.now()
In [3]:
data = pd.read_csv("train.csv")
In [4]:
#first few rows
data.head()
Out[4]:
id log_price property_type room_type amenities accommodates bathrooms bed_type cancellation_policy cleaning_fee ... latitude longitude name neighbourhood number_of_reviews review_scores_rating thumbnail_url zipcode bedrooms beds
0 6901257 5.010635 Apartment Entire home/apt {"Wireless Internet","Air conditioning",Kitche... 3 1.0 Real Bed strict True ... 40.696524 -73.991617 Beautiful brownstone 1-bedroom Brooklyn Heights 2 100.0 https://a0.muscache.com/im/pictures/6d7cbbf7-c... 11201 1.0 1.0
1 6304928 5.129899 Apartment Entire home/apt {"Wireless Internet","Air conditioning",Kitche... 7 1.0 Real Bed strict True ... 40.766115 -73.989040 Superb 3BR Apt Located Near Times Square Hell's Kitchen 6 93.0 https://a0.muscache.com/im/pictures/348a55fe-4... 10019 3.0 3.0
2 7919400 4.976734 Apartment Entire home/apt {TV,"Cable TV","Wireless Internet","Air condit... 5 1.0 Real Bed moderate True ... 40.808110 -73.943756 The Garden Oasis Harlem 10 92.0 https://a0.muscache.com/im/pictures/6fae5362-9... 10027 1.0 3.0
3 13418779 6.620073 House Entire home/apt {TV,"Cable TV",Internet,"Wireless Internet",Ki... 4 1.0 Real Bed flexible True ... 37.772004 -122.431619 Beautiful Flat in the Heart of SF! Lower Haight 0 NaN https://a0.muscache.com/im/pictures/72208dad-9... 94117.0 2.0 2.0
4 3808709 4.744932 Apartment Entire home/apt {TV,Internet,"Wireless Internet","Air conditio... 2 1.0 Real Bed moderate True ... 38.925627 -77.034596 Great studio in midtown DC Columbia Heights 4 40.0 NaN 20009 0.0 1.0

5 rows × 29 columns

Data Overview

In [6]:
print ("Rows     : " ,data.shape[0])
print ("Columns  : " ,data.shape[1])
print ("\nFeatures : \n" ,data.columns.tolist())
print ("\nMissing values :  ", data.isnull().sum().values.sum())
print ("\nUnique values :  \n",data.nunique())
Rows     :  74111
Columns  :  29

Features : 
 ['id', 'log_price', 'property_type', 'room_type', 'amenities', 'accommodates', 'bathrooms', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city', 'description', 'first_review', 'host_has_profile_pic', 'host_identity_verified', 'host_response_rate', 'host_since', 'instant_bookable', 'last_review', 'latitude', 'longitude', 'name', 'neighbourhood', 'number_of_reviews', 'review_scores_rating', 'thumbnail_url', 'zipcode', 'bedrooms', 'beds']

Missing values :   83752

Unique values :  
 id                        74111
log_price                   767
property_type                35
room_type                     3
amenities                 67122
accommodates                 16
bathrooms                    17
bed_type                      5
cancellation_policy           5
cleaning_fee                  2
city                          6
description               73479
first_review               2554
host_has_profile_pic          2
host_identity_verified        2
host_response_rate           80
host_since                 3087
instant_bookable              2
last_review                1371
latitude                  74111
longitude                 74111
name                      73359
neighbourhood               619
number_of_reviews           371
review_scores_rating         54
thumbnail_url             65883
zipcode                     769
bedrooms                     11
beds                         18
dtype: int64
In [7]:
##missing value percentage
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({'column_name': data.columns,
                                 'percent_missing': percent_missing})
missing_value_df 
Out[7]:
column_name percent_missing
id id 0.000000
log_price log_price 0.000000
property_type property_type 0.000000
room_type room_type 0.000000
amenities amenities 0.000000
accommodates accommodates 0.000000
bathrooms bathrooms 0.269865
bed_type bed_type 0.000000
cancellation_policy cancellation_policy 0.000000
cleaning_fee cleaning_fee 0.000000
city city 0.000000
description description 0.000000
first_review first_review 21.405729
host_has_profile_pic host_has_profile_pic 0.253674
host_identity_verified host_identity_verified 0.253674
host_response_rate host_response_rate 24.691341
host_since host_since 0.253674
instant_bookable instant_bookable 0.000000
last_review last_review 21.355804
latitude latitude 0.000000
longitude longitude 0.000000
name name 0.000000
neighbourhood neighbourhood 9.272578
number_of_reviews number_of_reviews 0.000000
review_scores_rating review_scores_rating 22.563452
thumbnail_url thumbnail_url 11.086074
zipcode zipcode 1.303450
bedrooms bedrooms 0.122789
beds beds 0.176762
In [11]:
#data type for each columns
data.dtypes
Out[11]:
id                          int64
log_price                 float64
property_type              object
room_type                  object
amenities                  object
accommodates                int64
bathrooms                 float64
bed_type                   object
cancellation_policy        object
cleaning_fee                 bool
city                       object
description                object
first_review               object
host_has_profile_pic       object
host_identity_verified     object
host_response_rate         object
host_since                 object
instant_bookable           object
last_review                object
latitude                  float64
longitude                 float64
name                       object
neighbourhood              object
number_of_reviews           int64
review_scores_rating      float64
thumbnail_url              object
zipcode                    object
bedrooms                  float64
beds                      float64
dtype: object
In [14]:
#data summary for numeric fields
data.describe()
Out[14]:
id log_price accommodates bathrooms latitude longitude number_of_reviews review_scores_rating bedrooms beds
count 7.411100e+04 74111.000000 74111.000000 73911.000000 74111.000000 74111.000000 74111.000000 57389.000000 74020.000000 73980.000000
mean 1.126662e+07 4.782069 3.155146 1.235263 38.445958 -92.397525 20.900568 94.067365 1.265793 1.710868
std 6.081735e+06 0.717394 2.153589 0.582044 3.080167 21.705322 37.828641 7.836556 0.852143 1.254142
min 3.440000e+02 0.000000 1.000000 0.000000 33.338905 -122.511500 0.000000 20.000000 0.000000 0.000000
25% 6.261964e+06 4.317488 2.000000 1.000000 34.127908 -118.342374 1.000000 92.000000 1.000000 1.000000
50% 1.225415e+07 4.709530 2.000000 1.000000 40.662138 -76.996965 6.000000 96.000000 1.000000 1.000000
75% 1.640226e+07 5.220356 4.000000 1.000000 40.746096 -73.954660 23.000000 100.000000 1.000000 2.000000
max 2.123090e+07 7.600402 16.000000 8.000000 42.390437 -70.985047 605.000000 100.000000 10.000000 18.000000

Exploratory Data Analysis

In [15]:
data['id'].nunique()
Out[15]:
74111
In [80]:
#define a function to plot interactive distrbution graph
def distribution_plot(dataset,column,title,xtitle,ytitle):
    trace = go.Histogram(x=dataset[column], opacity=0.7, marker={"line": {"color": "#25232C"}})
    layout = go.Layout(title=title, xaxis={"title": xtitle, "showgrid": False},
                       yaxis={"title": ytitle, "showgrid": False},plot_bgcolor='rgba(0,0,0,0)',
                      paper_bgcolor='rgba(0,0,0,0)') #showgrid:False to remove gridline
    figure = {"data": [trace], "layout": layout}

    py.iplot(figure)
    
In [81]:
distribution_plot(data,'log_price',f"Log Price Distribution","log_price","Count")
In [76]:
data['property_type'].value_counts()
Out[76]:
Apartment             49003
House                 16511
Condominium            2658
Townhouse              1692
Loft                   1244
Other                   607
Guesthouse              498
Bed & Breakfast         462
Bungalow                366
Villa                   179
Dorm                    142
Guest suite             123
Camper/RV                94
Timeshare                77
Cabin                    72
In-law                   71
Hostel                   70
Boutique hotel           69
Boat                     65
Serviced apartment       21
Tent                     18
Castle                   13
Vacation home            11
Yurt                      9
Hut                       8
Treehouse                 7
Chalet                    6
Earth House               4
Tipi                      3
Cave                      2
Train                     2
Lighthouse                1
Parking Space             1
Casa particular           1
Island                    1
Name: property_type, dtype: int64
In [100]:
#property type distribution
distribution_plot(data,'property_type',f"Property Type Distribution","Property Type", "Count")
#room type distribution
distribution_plot(data,'room_type',f"Room Type Distribution","Room Type", "Count")
#accomodates
distribution_plot(data,'accommodates',f"Accommodates Distribution","Accommodates", "Count")
In [102]:
#bathrooms
distribution_plot(data,'bathrooms',f"Bathroom Distribution","Bathrooms", "Count")
#bed type
distribution_plot(data,'bed_type',f"Bed Type Distribution","Bed Type", "Count")
#cancellation_policy   
distribution_plot(data,'cancellation_policy',f"Cancellation Policy Distribution","Cancellation Policy", "Count")
In [103]:
#cleaning_fee  
distribution_plot(data,'cleaning_fee',f"Cleaning Fee Distribution","Cleaning Fee", "Count")
#city 
distribution_plot(data,'city',f"City Distribution","City", "Count")
In [105]:
#host_has_profile_pic   
distribution_plot(data,'host_has_profile_pic',f"Host Profile Pic Distribution","Host Profile(Yes/No)", "Count")
#host_identity_verified
distribution_plot(data,'host_identity_verified',f"Host Indentity Disrtibution Distribution","Host Identification(Yes/No)", "Count")
#host_response_rate  
distribution_plot(data,'host_response_rate',f"Host Response Rate Distribution","Host Response Rate", "Count")
In [128]:
#host_has_profile_pic   
distribution_plot(data,'instant_bookable',f"Instant Bookable Distribution","Instant Bookable(Yes/No)", "Count")
#number_of_reviews
distribution_plot(data,'number_of_reviews',f"Number of Review Distribution","Number of Review", "Count")
#review_scores_rating 
distribution_plot(data,'review_scores_rating',f"Review Rating Distribution","Review Rating", "Count")
In [150]:
#the median of the review rating is 96
data['review_scores_rating'].median()
Out[150]:
96.0
In [129]:
#bedrooms
distribution_plot(data,'bedrooms',f"Number of Bedroom Distribution","Number of Bedroom", "Count")
#beds
distribution_plot(data,'beds',f"Number of Bed Distribution","Number of Bed", "Count")
In [111]:
#convert the column to datetime data type
data['host_since'] = pd.to_datetime(data['host_since'], infer_datetime_format=True)
In [120]:
host_since = data['host_since'].value_counts().reset_index()
host_since = host_since.sort_values('index')
host_since.head()
Out[120]:
index host_since
2961 2008-03-03 1
3013 2008-03-04 1
2999 2008-04-21 1
3081 2008-06-27 1
2938 2008-07-11 1
In [122]:
fig = px.line(host_since, x='index', y='host_since')
fig.update_layout(title='Year Host Started Distribution', xaxis={"title": 'Year', "showgrid": False},
                       yaxis={"title": 'Count', "showgrid": False},plot_bgcolor='rgba(0,0,0,0)',
                      paper_bgcolor='rgba(0,0,0,0)')
fig.show()
In [125]:
#top 5 names
data['name'].value_counts().head()
Out[125]:
Bunk bed in the Treat Street Clubhouse    8
East Village Studio                       7
Your home away from home                  7
Location, Location, Location              6
Spacious Private Room in Brooklyn         6
Name: name, dtype: int64
In [126]:
#top 5 neighborhood
data['neighbourhood'].value_counts().head()
Out[126]:
Williamsburg          2862
Bedford-Stuyvesant    2166
Bushwick              1601
Upper West Side       1396
Mid-Wilshire          1392
Name: neighbourhood, dtype: int64
In [ ]:
#host_identity_verified
distribution_plot(data,'instant_bookable',f"Host Indentity Disrtibution Distribution","Host Identification(Yes/No)", "Count")
#host_response_rate  
distribution_plot(data,'host_response_rate',f"Host Response Rate Distribution","Host Response Rate", "Count")
In [91]:
data['amenities'].value_counts()
Out[91]:
{}                                                                                                                                                                                                                                                                                                                               586
{"translation missing: en.hosting_amenity_49","translation missing: en.hosting_amenity_50"}                                                                                                                                                                                                                                      135
{"Family/kid friendly"}                                                                                                                                                                                                                                                                                                          103
{"Pets allowed","Family/kid friendly"}                                                                                                                                                                                                                                                                                            27
{TV,"Cable TV",Internet,"Wireless Internet","Air conditioning",Kitchen,"Pets allowed",Doorman,Gym,Elevator,Heating,"Family/kid friendly",Washer,Dryer,"Smoke detector","Carbon monoxide detector",Essentials,Shampoo,"24-hour check-in",Hangers,"Hair dryer",Iron,"Laptop friendly workspace","Self Check-In",Doorman}            26
                                                                                                                                                                                                                                                                                                                                ... 
{TV,"Wireless Internet","Air conditioning",Kitchen,"Free parking on premises",Breakfast,Heating,"Family/kid friendly",Washer,Dryer,"Smoke detector","Carbon monoxide detector","Safety card",Essentials,Shampoo,"Lock on bedroom door",Hangers,"Laptop friendly workspace"}                                                        1
{TV,"Cable TV","Wireless Internet","Air conditioning",Kitchen,Heating,"Family/kid friendly","Smoke detector","Carbon monoxide detector","First aid kit","Safety card","Fire extinguisher",Essentials,Hangers,"Hair dryer",Iron,"Self Check-In",Lockbox,"Private entrance","Pack ’n Play/travel crib","Room-darkening shades"}      1
{Internet,"Wireless Internet","Air conditioning",Kitchen,"Buzzer/wireless intercom","Smoke detector",Essentials,"Laptop friendly workspace","translation missing: en.hosting_amenity_50"}                                                                                                                                          1
{TV,"Cable TV",Internet,"Wireless Internet","Air conditioning",Kitchen,Heating,"Family/kid friendly","Smoke detector","Carbon monoxide detector",Hangers,"Hair dryer",Iron}                                                                                                                                                        1
{TV,"Cable TV",Internet,"Wireless Internet","Wheelchair accessible",Kitchen,Elevator,Heating,Washer,Dryer,"Smoke detector","Carbon monoxide detector","First aid kit","Fire extinguisher",Essentials,Shampoo,"translation missing: en.hosting_amenity_49","translation missing: en.hosting_amenity_50"}                            1
Name: amenities, Length: 67122, dtype: int64
In [95]:
data['cancellation_policy'].value_counts()
Out[95]:
strict             32374
flexible           22545
moderate           19063
super_strict_30      112
super_strict_60       17
Name: cancellation_policy, dtype: int64
In [141]:
#price distribution on US map
fig = px.scatter_mapbox(data, lat="latitude", lon="longitude", hover_data=["log_price"],color="log_price",
                  color_discrete_sequence=["fuchsia"], zoom=3, height=300)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
In [148]:
#number of bed distribution on US map
fig = px.scatter_mapbox(data, lat="latitude", lon="longitude", hover_data=["property_type"],color="beds",
                  color_discrete_sequence=["fuchsia"], zoom=4, height=300)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
In [149]:
#rating disrtbuition on US map
fig = px.scatter_mapbox(data, lat="latitude", lon="longitude", hover_data=["property_type"],color="review_scores_rating",
                  color_discrete_sequence=["fuchsia"], zoom=4, height=300)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

NLP Analysis on Amentities

In [155]:
Amen = pd.DataFrame(data['amenities'])
Amen.head()
Out[155]:
amenities
0 {"Wireless Internet","Air conditioning",Kitche...
1 {"Wireless Internet","Air conditioning",Kitche...
2 {TV,"Cable TV","Wireless Internet","Air condit...
3 {TV,"Cable TV",Internet,"Wireless Internet",Ki...
4 {TV,Internet,"Wireless Internet","Air conditio...
In [158]:
#remove the signs
Amen["amenities"] = Amen["amenities"].str.replace("{", "")
Amen["amenities"] = Amen["amenities"].str.replace("}", "")
Amen["amenities"] = Amen["amenities"].str.replace("\"", "")
In [160]:
#turn the description to list by splitting them using ","
Amen["amenities"] = Amen["amenities"].str.split(pat = ",")
In [161]:
Amen.head()
Out[161]:
amenities
0 [Wireless Internet, Air conditioning, Kitchen,...
1 [Wireless Internet, Air conditioning, Kitchen,...
2 [TV, Cable TV, Wireless Internet, Air conditio...
3 [TV, Cable TV, Internet, Wireless Internet, Ki...
4 [TV, Internet, Wireless Internet, Air conditio...
In [167]:
#Vectorize the item in the list by count
from sklearn.feature_extraction.text import CountVectorizer
inp = ["<some_space>".join(x) for x in Amen["amenities"]]
vectorizer = CountVectorizer(tokenizer = lambda x: x.split("<some_space>"), analyzer="word")

vector = vectorizer.fit_transform(inp)
In [165]:
print(vectorizer.get_feature_names())
['', ' smooth pathway to front door', '24-hour check-in', 'accessible-height bed', 'accessible-height toilet', 'air conditioning', 'air purifier', 'baby bath', 'baby monitor', 'babysitter recommendations', 'bath towel', 'bathtub', 'bathtub with shower chair', 'bbq grill', 'beach essentials', 'beachfront', 'bed linens', 'body soap', 'breakfast', 'buzzer/wireless intercom', 'cable tv', 'carbon monoxide detector', 'cat(s)', 'changing table', 'children’s books and toys', 'children’s dinnerware', 'cleaning before checkout', 'coffee maker', 'cooking basics', 'crib', 'disabled parking spot', 'dishes and silverware', 'dishwasher', 'dog(s)', 'doorman', 'doorman entry', 'dryer', 'elevator', 'elevator in building', 'essentials', 'ethernet connection', 'ev charger', 'extra pillows and blankets', 'family/kid friendly', 'fire extinguisher', 'fireplace guards', 'firm matress', 'firm mattress', 'first aid kit', 'fixed grab bars for shower & toilet', 'flat', 'flat smooth pathway to front door', 'free parking on premises', 'free parking on street', 'game console', 'garden or backyard', 'grab-rails for shower and toilet', 'ground floor access', 'gym', 'hair dryer', 'hand or paper towel', 'hand soap', 'handheld shower head', 'hangers', 'heating', 'high chair', 'host greets you', 'hot tub', 'hot water', 'hot water kettle', 'indoor fireplace', 'internet', 'iron', 'keypad', 'kitchen', 'lake access', 'laptop friendly workspace', 'lock on bedroom door', 'lockbox', 'long term stays allowed', 'luggage dropoff allowed', 'microwave', 'other', 'other pet(s)', 'outlet covers', 'oven', 'pack ’n play/travel crib', 'paid parking off premises', 'path to entrance lit at night', 'patio or balcony', 'pets allowed', 'pets live on this property', 'pocket wifi', 'pool', 'private bathroom', 'private entrance', 'private living room', 'refrigerator', 'roll-in shower with chair', 'room-darkening shades', 'safety card', 'self check-in', 'shampoo', 'single level home', 'ski in/ski out', 'smart lock', 'smartlock', 'smoke detector', 'smoking allowed', 'stair gates', 'step-free access', 'stove', 'suitable for events', 'table corner guards', 'toilet paper', 'translation missing: en.hosting_amenity_49', 'translation missing: en.hosting_amenity_50', 'tv', 'washer', 'washer / dryer', 'waterfront', 'well-lit path to entrance', 'wheelchair accessible', 'wide clearance to bed', 'wide clearance to shower & toilet', 'wide clearance to shower and toilet', 'wide doorway', 'wide entryway', 'wide hallway clearance', 'window guards', 'wireless internet']
In [168]:
Amen_df = pd.DataFrame(vector.toarray(), columns=vectorizer.get_feature_names())
Amen_df.head()
Out[168]:
smooth pathway to front door 24-hour check-in accessible-height bed accessible-height toilet air conditioning air purifier baby bath baby monitor babysitter recommendations ... well-lit path to entrance wheelchair accessible wide clearance to bed wide clearance to shower & toilet wide clearance to shower and toilet wide doorway wide entryway wide hallway clearance window guards wireless internet
0 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 131 columns

In [180]:
count_list = vector.toarray().sum(axis=0)
amen_dict = dict(zip(vectorizer.get_feature_names(),count_list))
del amen_dict[''] #remove empty value
amen_dict
Out[180]:
{' smooth pathway to front door': 397,
 '24-hour check-in': 19015,
 'accessible-height bed': 349,
 'accessible-height toilet': 261,
 'air conditioning': 55210,
 'air purifier': 14,
 'baby bath': 305,
 'baby monitor': 133,
 'babysitter recommendations': 677,
 'bath towel': 1,
 'bathtub': 3761,
 'bathtub with shower chair': 34,
 'bbq grill': 633,
 'beach essentials': 102,
 'beachfront': 33,
 'bed linens': 4178,
 'body soap': 1,
 'breakfast': 8306,
 'buzzer/wireless intercom': 17033,
 'cable tv': 24253,
 'carbon monoxide detector': 47190,
 'cat(s)': 3593,
 'changing table': 225,
 'children’s books and toys': 1126,
 'children’s dinnerware': 635,
 'cleaning before checkout': 271,
 'coffee maker': 3555,
 'cooking basics': 3958,
 'crib': 442,
 'disabled parking spot': 34,
 'dishes and silverware': 4255,
 'dishwasher': 2312,
 'dog(s)': 5255,
 'doorman': 4780,
 'doorman entry': 453,
 'dryer': 42711,
 'elevator': 10820,
 'elevator in building': 6417,
 'essentials': 64005,
 'ethernet connection': 693,
 'ev charger': 49,
 'extra pillows and blankets': 3026,
 'family/kid friendly': 37026,
 'fire extinguisher': 30724,
 'fireplace guards': 219,
 'firm matress': 15,
 'firm mattress': 115,
 'first aid kit': 27532,
 'fixed grab bars for shower & toilet': 68,
 'flat': 397,
 'flat smooth pathway to front door': 11,
 'free parking on premises': 23639,
 'free parking on street': 77,
 'game console': 479,
 'garden or backyard': 1129,
 'grab-rails for shower and toilet': 2,
 'ground floor access': 25,
 'gym': 7491,
 'hair dryer': 43330,
 'hand or paper towel': 1,
 'hand soap': 1,
 'handheld shower head': 136,
 'hangers': 49173,
 'heating': 67073,
 'high chair': 635,
 'host greets you': 1196,
 'hot tub': 6330,
 'hot water': 4267,
 'hot water kettle': 173,
 'indoor fireplace': 9300,
 'internet': 44648,
 'iron': 41687,
 'keypad': 2995,
 'kitchen': 67526,
 'lake access': 18,
 'laptop friendly workspace': 43703,
 'lock on bedroom door': 17983,
 'lockbox': 5738,
 'long term stays allowed': 1671,
 'luggage dropoff allowed': 1640,
 'microwave': 3912,
 'other': 218,
 'other pet(s)': 378,
 'outlet covers': 446,
 'oven': 3944,
 'pack ’n play/travel crib': 1124,
 'paid parking off premises': 7,
 'path to entrance lit at night': 27,
 'patio or balcony': 1220,
 'pets allowed': 10197,
 'pets live on this property': 9730,
 'pocket wifi': 217,
 'pool': 6283,
 'private bathroom': 74,
 'private entrance': 7270,
 'private living room': 2524,
 'refrigerator': 4650,
 'roll-in shower with chair': 3,
 'room-darkening shades': 1496,
 'safety card': 11513,
 'self check-in': 11041,
 'shampoo': 49465,
 'single level home': 519,
 'ski in/ski out': 13,
 'smart lock': 657,
 'smartlock': 787,
 'smoke detector': 61727,
 'smoking allowed': 3673,
 'stair gates': 299,
 'step-free access': 2060,
 'stove': 4083,
 'suitable for events': 4268,
 'table corner guards': 88,
 'toilet paper': 1,
 'translation missing: en.hosting_amenity_49': 20427,
 'translation missing: en.hosting_amenity_50': 25291,
 'tv': 52458,
 'washer': 43169,
 'washer / dryer': 31,
 'waterfront': 115,
 'well-lit path to entrance': 770,
 'wheelchair accessible': 4848,
 'wide clearance to bed': 406,
 'wide clearance to shower & toilet': 123,
 'wide clearance to shower and toilet': 1,
 'wide doorway': 1062,
 'wide entryway': 302,
 'wide hallway clearance': 475,
 'window guards': 615,
 'wireless internet': 71265}
In [182]:
amen_dict = sorted(amen_dict.items(), key=lambda x: x[1])
amen_dict
Out[182]:
[('bath towel', 1),
 ('body soap', 1),
 ('hand or paper towel', 1),
 ('hand soap', 1),
 ('toilet paper', 1),
 ('wide clearance to shower and toilet', 1),
 ('grab-rails for shower and toilet', 2),
 ('roll-in shower with chair', 3),
 ('paid parking off premises', 7),
 ('flat smooth pathway to front door', 11),
 ('ski in/ski out', 13),
 ('air purifier', 14),
 ('firm matress', 15),
 ('lake access', 18),
 ('ground floor access', 25),
 ('path to entrance lit at night', 27),
 ('washer / dryer', 31),
 ('beachfront', 33),
 ('bathtub with shower chair', 34),
 ('disabled parking spot', 34),
 ('ev charger', 49),
 ('fixed grab bars for shower & toilet', 68),
 ('private bathroom', 74),
 ('free parking on street', 77),
 ('table corner guards', 88),
 ('beach essentials', 102),
 ('firm mattress', 115),
 ('waterfront', 115),
 ('wide clearance to shower & toilet', 123),
 ('baby monitor', 133),
 ('handheld shower head', 136),
 ('hot water kettle', 173),
 ('pocket wifi', 217),
 ('other', 218),
 ('fireplace guards', 219),
 ('changing table', 225),
 ('accessible-height toilet', 261),
 ('cleaning before checkout', 271),
 ('stair gates', 299),
 ('wide entryway', 302),
 ('baby bath', 305),
 ('accessible-height bed', 349),
 ('other pet(s)', 378),
 (' smooth pathway to front door', 397),
 ('flat', 397),
 ('wide clearance to bed', 406),
 ('crib', 442),
 ('outlet covers', 446),
 ('doorman entry', 453),
 ('wide hallway clearance', 475),
 ('game console', 479),
 ('single level home', 519),
 ('window guards', 615),
 ('bbq grill', 633),
 ('children’s dinnerware', 635),
 ('high chair', 635),
 ('smart lock', 657),
 ('babysitter recommendations', 677),
 ('ethernet connection', 693),
 ('well-lit path to entrance', 770),
 ('smartlock', 787),
 ('wide doorway', 1062),
 ('pack ’n play/travel crib', 1124),
 ('children’s books and toys', 1126),
 ('garden or backyard', 1129),
 ('host greets you', 1196),
 ('patio or balcony', 1220),
 ('room-darkening shades', 1496),
 ('luggage dropoff allowed', 1640),
 ('long term stays allowed', 1671),
 ('step-free access', 2060),
 ('dishwasher', 2312),
 ('private living room', 2524),
 ('keypad', 2995),
 ('extra pillows and blankets', 3026),
 ('coffee maker', 3555),
 ('cat(s)', 3593),
 ('smoking allowed', 3673),
 ('bathtub', 3761),
 ('microwave', 3912),
 ('oven', 3944),
 ('cooking basics', 3958),
 ('stove', 4083),
 ('bed linens', 4178),
 ('dishes and silverware', 4255),
 ('hot water', 4267),
 ('suitable for events', 4268),
 ('refrigerator', 4650),
 ('doorman', 4780),
 ('wheelchair accessible', 4848),
 ('dog(s)', 5255),
 ('lockbox', 5738),
 ('pool', 6283),
 ('hot tub', 6330),
 ('elevator in building', 6417),
 ('private entrance', 7270),
 ('gym', 7491),
 ('breakfast', 8306),
 ('indoor fireplace', 9300),
 ('pets live on this property', 9730),
 ('pets allowed', 10197),
 ('elevator', 10820),
 ('self check-in', 11041),
 ('safety card', 11513),
 ('buzzer/wireless intercom', 17033),
 ('lock on bedroom door', 17983),
 ('24-hour check-in', 19015),
 ('translation missing: en.hosting_amenity_49', 20427),
 ('free parking on premises', 23639),
 ('cable tv', 24253),
 ('translation missing: en.hosting_amenity_50', 25291),
 ('first aid kit', 27532),
 ('fire extinguisher', 30724),
 ('family/kid friendly', 37026),
 ('iron', 41687),
 ('dryer', 42711),
 ('washer', 43169),
 ('hair dryer', 43330),
 ('laptop friendly workspace', 43703),
 ('internet', 44648),
 ('carbon monoxide detector', 47190),
 ('hangers', 49173),
 ('shampoo', 49465),
 ('tv', 52458),
 ('air conditioning', 55210),
 ('smoke detector', 61727),
 ('essentials', 64005),
 ('heating', 67073),
 ('kitchen', 67526),
 ('wireless internet', 71265)]
In [175]:
#generate word cloud
from PIL import Image
from wordcloud import WordCloud

wc = WordCloud(background_color="white",width=1000,height=1000, max_words=10,relative_scaling=0.5,normalize_plurals=False).generate_from_frequencies(amen_dict)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()